LightGBM Models Training

WIDS 2024 Challenge ++

BMI 212 - Team DMMTS

In [1]:
import lightgbm as lgb
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import root_mean_squared_error
from sklearn import preprocessing
pd.set_option("mode.copy_on_write", True)

Load the data

In [2]:
# Load the CSV dataset
data_df = pd.read_csv('./Data/train_test_added_climate_data_imputed.csv')

# Rename feature columns for better readability
data_df.rename(columns={'bmi': 'patient_bmi',
                   'region': 'patient_region',
                   'division': 'patient_division',
                   'side': 'patient_tumor_side',
                   'quadrant': 'patient_tumor_quadrant',
                   'metastatic_organ': 'patient_metastatic_organ',
                   'cleaned_metastatic_first_treatment': 'patient_metastatic_first_treatment',
                   'cleaned_metastatic_first_treatment_type': 'patient_metastatic_first_treatment_type',
                   'population': 'population_size',
                   'density': 'population_density',
                   'age_median': 'population_age_median',
                   'female': 'population_female_perc',
                   'married': 'population_married_perc',
                   'divorced': 'population_divorced_perc',
                   'never_married': 'population_never_married_perc',
                   'widowed': 'population_widowed_perc',
                   'family_size': 'population_family_size',
                   'family_dual_income': 'population_family_dual_income_perc',
                   'income_individual_median': 'population_income_individual_median',
                   'income_household_median': 'population_income_household_median',
                   'home_ownership': 'population_home_ownership_perc',
                   'home_value': 'population_home_value',
                   'rent_median': 'population_rent_median',
                   'rent_burden': 'population_rent_burden_perc',
                   'education_less_highschool': 'population_education_less_highschool_perc',
                   'education_highschool': 'population_education_highschool_perc',
                   'education_some_college': 'population_education_some_college_perc',
                   'education_bachelors': 'population_education_bachelors_perc',
                   'education_graduate': 'population_education_graduate_perc',
                   'education_college_or_above': 'population_education_college_or_above_perc',
                   'education_stem_degree': 'population_education_stem_degree_perc',
                   'unemployment_rate': 'population_unemployment_rate',
                   'self_employed': 'population_self_employed_perc',
                   'farmer': 'population_farmer_perc',
                   'race_white': 'population_race_white_perc',
                   'race_black': 'population_race_black_perc',
                   'race_asian': 'population_race_asian_perc',
                   'race_native': 'population_race_native_american_perc',
                   'race_pacific': 'population_race_pacific_islander_perc',
                   'race_other': 'population_race_other_perc',
                   'race_multiple': 'population_race_multiple_perc',
                   'hispanic': 'population_hispanic_perc',
                   'disabled': 'population_disabled_perc',
                   'poverty': 'population_poverty_perc',
                   'limited_english': 'population_limited_english_perc',
                   'commute_time': 'population_commute_time',
                   'health_uninsured': 'population_health_uninsured_perc',
                   'veteran': 'population_veteran_perc',
                    'climate_ozone': 'annual_ozone_conc',
                    'climate_pm25': 'annual_fine_particulate_matter_conc',
                    'climate_n02': 'annual_nitrogen_dioxide_conc'
                   }, inplace=True)

Select the features to use

In [3]:
features = ['patient_race', 'payer_type', 'patient_state', 'patient_age', 'patient_gender', 'patient_bmi',
            'patient_region', 'patient_division', 'patient_tumor_side', 'patient_tumor_quadrant',
            'patient_metastatic_organ', 'patient_metastatic_first_treatment', 'patient_metastatic_first_treatment_type',
            'population_size', 'population_density', 'population_age_median', 'population_female_perc',
            'population_married_perc', 'population_divorced_perc', 'population_never_married_perc',
            'population_widowed_perc', 'population_family_size', 'population_family_dual_income_perc',
            'population_income_individual_median', 'population_income_household_median', 'population_home_ownership_perc',
            'population_home_value', 'population_rent_median', 'population_rent_burden_perc',
            'population_education_less_highschool_perc', 'population_education_highschool_perc',
            'population_education_some_college_perc', 'population_education_bachelors_perc',
            'population_education_graduate_perc', 'population_education_college_or_above_perc',
            'population_education_stem_degree_perc', 'population_unemployment_rate', 'population_self_employed_perc',
            'population_farmer_perc', 'population_race_white_perc', 'population_race_black_perc',
            'population_race_asian_perc', 'population_race_native_american_perc', 'population_race_pacific_islander_perc',
            'population_race_other_perc', 'population_race_multiple_perc', 'population_hispanic_perc',
            'population_disabled_perc', 'population_poverty_perc', 'population_limited_english_perc',
            'population_commute_time', 'population_health_uninsured_perc', 'population_veteran_perc', 'annual_nitrogen_dioxide_conc',
            'annual_fine_particulate_matter_conc', 'annual_ozone_conc']

# Select only rows where allocated_set is train
train_df = data_df[data_df['allocated_set'] == 'train']
train_df.head()

# Select the features to use
features_df = train_df[features]

# Extract labels for time to treatment 
labels_df = train_df[['treatment_pd']]
In [4]:
features_df['patient_bmi'].value_counts()
Out[4]:
patient_bmi
27.00    763
40.00    701
24.00    511
26.00    495
23.00    472
        ... 
38.07      1
39.83      1
33.04      1
20.86      1
43.68      1
Name: count, Length: 1935, dtype: int64

Define categorical variables

In [5]:
# Convert object features to categorical
for col in features_df.select_dtypes(include='object').columns:
    features_df[col] = features_df[col].astype('category')

# List of categorical features
categorical_features = list(features_df.select_dtypes(include='category').columns)
print(categorical_features)
['patient_race', 'payer_type', 'patient_state', 'patient_region', 'patient_division', 'patient_tumor_side', 'patient_tumor_quadrant', 'patient_metastatic_organ', 'patient_metastatic_first_treatment', 'patient_metastatic_first_treatment_type']
In [6]:
# Note: Significant portion of race data is missing! Double-check that they have been imputed.
features_df['patient_race'].value_counts(dropna=False)
Out[6]:
patient_race
White       14577
Black        4713
Other        3917
Hispanic     3173
Asian        1294
Name: count, dtype: int64

Drop low-variance features

In [7]:
# Temporarily convert categorical features to distinct numerical codes, keeping missing/NaN values
temp_feature_df = features_df.copy()
for cat_feat in categorical_features:
    temp_feature_df[cat_feat] = temp_feature_df[cat_feat].cat.codes
    temp_feature_df.loc[temp_feature_df[cat_feat] == -1] = np.NaN
In [8]:
temp_feature_df.head()
Out[8]:
patient_race payer_type patient_state patient_age patient_gender patient_bmi patient_region patient_division patient_tumor_side patient_tumor_quadrant ... population_hispanic_perc population_disabled_perc population_poverty_perc population_limited_english_perc population_commute_time population_health_uninsured_perc population_veteran_perc annual_nitrogen_dioxide_conc annual_fine_particulate_matter_conc annual_ozone_conc
0 0 0 4 46 False 27.00 3 5 1 7 ... 47.726087 9.895652 10.515217 12.745652 32.530435 7.263043 3.810870 20.084231 9.595719 47.310325
1 4 2 35 63 False 27.82 0 0 0 3 ... 1.182979 18.317021 13.546809 0.146809 31.890909 7.631915 9.631915 12.485563 8.399666 40.544389
2 2 0 44 53 False 27.12 2 8 0 2 ... 21.064151 14.083019 11.943396 2.549057 32.556250 16.396226 10.392453 11.522807 9.361774 38.696786
3 2 1 4 50 False 26.42 3 5 2 8 ... 37.948485 8.957576 10.109091 8.057576 30.606061 7.018182 4.103030 20.113179 8.487175 42.301121
4 3 0 34 39 False 18.00 1 2 2 8 ... 18.960526 10.194737 18.642105 14.173684 42.502632 6.392105 1.755263 27.496367 7.879795 37.722740

5 rows × 56 columns

In [11]:
from sklearn.feature_selection import VarianceThreshold

# Remove low-var features from temp df (will drop removed cols from original df)
# Ref: https://scikit-learn.org/stable/modules/feature_selection.html
low_var_filter = VarianceThreshold(threshold=(.9 * (1 - .9)))
filtered_features = low_var_filter.fit_transform(temp_feature_df)
filtered_feature_names = low_var_filter.get_feature_names_out(input_features=features)
print("Low-variance features: ", set(features) - set(filtered_feature_names))

filtered_features_df = features_df[filtered_feature_names].copy()
Low-variance features:  {'patient_gender', 'population_family_size', 'patient_metastatic_first_treatment_type'}
In [10]:
filtered_features_df['patient_bmi'].value_counts()
Out[10]:
patient_bmi
27.00    763
40.00    701
24.00    511
26.00    495
23.00    472
        ... 
38.07      1
39.83      1
33.04      1
20.86      1
43.68      1
Name: count, Length: 1935, dtype: int64

Remove features with >60% missing data

In [111]:
dropped_cols = []
for col in filtered_features_df.columns:
    percent_missing = filtered_features_df[col].isnull().sum() / len(filtered_features_df[col])
    if percent_missing > 0.6:
        dropped_cols.append(col)
        filtered_features_df.drop(columns=[col], inplace=True)
print("Dropped features: ", dropped_cols)
Dropped features:  []

Impute data

In [112]:
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
In [113]:
filtered_features_df['patient_race'].value_counts(normalize=True, dropna=False)
Out[113]:
patient_race
White       0.526740
Black       0.170304
Other       0.141541
Hispanic    0.114656
Asian       0.046759
Name: proportion, dtype: float64
In [114]:
# Identify numerical features 
numerical_features = list(filtered_features_df.select_dtypes(include='number').columns)

# Impute numerical features (estimates each feature from others)
imputer = IterativeImputer()
imputed_num_data = imputer.fit_transform(filtered_features_df[numerical_features])
imputed_num_df = pd.DataFrame(imputed_num_data, columns=numerical_features)
imputed_num_df.head()
Out[114]:
patient_age patient_bmi population_size population_density population_age_median population_female_perc population_married_perc population_divorced_perc population_never_married_perc population_widowed_perc ... population_hispanic_perc population_disabled_perc population_poverty_perc population_limited_english_perc population_commute_time population_health_uninsured_perc population_veteran_perc annual_nitrogen_dioxide_conc annual_fine_particulate_matter_conc annual_ozone_conc
0 46.0 27.00 43031.0 2048.578261 38.852174 50.947826 48.504348 10.117391 36.408696 4.969565 ... 47.726087 9.895652 10.515217 12.745652 32.530435 7.263043 3.810870 20.084231 9.595719 47.310325
1 63.0 27.82 7228.0 194.656250 41.247826 47.908511 50.672340 14.102128 27.117021 8.112766 ... 1.182979 18.317021 13.546809 0.146809 31.890909 7.631915 9.631915 12.485563 8.399666 40.544389
2 53.0 27.12 24751.0 352.226786 41.371154 50.645283 52.994340 13.341509 25.094340 8.579245 ... 21.064151 14.083019 11.943396 2.549057 32.556250 16.396226 10.392453 11.522807 9.361774 38.696786
3 50.0 26.42 39122.0 2295.939394 38.200000 50.106061 50.245455 9.827273 35.290909 4.651515 ... 37.948485 8.957576 10.109091 8.057576 30.606061 7.018182 4.103030 20.113179 8.487175 42.301121
4 39.0 18.00 71374.0 17326.407890 36.476316 52.331579 39.923684 10.239474 44.642105 5.186842 ... 18.960526 10.194737 18.642105 14.173684 42.502632 6.392105 1.755263 27.496367 7.879795 37.722740

5 rows × 44 columns

In [115]:
# Set numeric columns in filtered features df to imputed columns
for col in numerical_features:
    filtered_features_df.loc[:, col] = imputed_num_df[col]
filtered_features_df.head()
Out[115]:
patient_race payer_type patient_state patient_age patient_bmi patient_region patient_division patient_tumor_side patient_tumor_quadrant patient_metastatic_organ ... population_hispanic_perc population_disabled_perc population_poverty_perc population_limited_english_perc population_commute_time population_health_uninsured_perc population_veteran_perc annual_nitrogen_dioxide_conc annual_fine_particulate_matter_conc annual_ozone_conc
0 Asian COMMERCIAL CA 46 27.00 West Pacific R overlap lymph node, unspecified ... 47.726087 9.895652 10.515217 12.745652 32.530435 7.263043 3.810870 20.084231 9.595719 47.310325
1 White MEDICARE ADVANTAGE OH 63 27.82 Midwest East North Central L UO bone ... 1.182979 18.317021 13.546809 0.146809 31.890909 7.631915 9.631915 12.485563 8.399666 40.544389
2 Hispanic COMMERCIAL TX 53 27.12 South West South Central L UI axilla and upper limb lymph nodes ... 21.064151 14.083019 11.943396 2.549057 32.556250 16.396226 10.392453 11.522807 9.361774 38.696786
3 Hispanic MEDICAID CA 50 26.42 West Pacific unspecified unspecified liver ... 37.948485 8.957576 10.109091 8.057576 30.606061 7.018182 4.103030 20.113179 8.487175 42.301121
4 Other COMMERCIAL NY 39 18.00 Northeast Middle Atlantic unspecified unspecified unspecified ... 18.960526 10.194737 18.642105 14.173684 42.502632 6.392105 1.755263 27.496367 7.879795 37.722740

5 rows × 53 columns

Split into train/test 80/20¶

In [116]:
# Rename dfs of features and target
X = filtered_features_df
y = labels_df

30-day labels

In [117]:
# Create labels for >30 days time to treatment
labels_30 = labels_df.copy()
labels_30.loc[labels_30['treatment_pd'] > 30, 'label'] = 1 
labels_30.loc[labels_30['treatment_pd'] <= 30, 'label'] = 0 
labels_30.head()
Out[117]:
treatment_pd label
0 35.0 1.0
1 33.0 1.0
2 24.0 0.0
3 455.0 1.0
4 162.0 1.0
In [118]:
y_30 = labels_30[['label']]

# Split the data into train/test split (80/20)
X_train_30, X_test_30, y_train_30, y_test_30 = train_test_split(X, y_30, test_size=0.2, random_state=123)

60-days labels

In [119]:
# Create labels for >30 days time to treatment
labels_60 = labels_df.copy()
labels_60.loc[labels_60['treatment_pd'] > 60, 'label'] = 1 
labels_60.loc[labels_60['treatment_pd'] <= 60, 'label'] = 0 
labels_60.head()
Out[119]:
treatment_pd label
0 35.0 0.0
1 33.0 0.0
2 24.0 0.0
3 455.0 1.0
4 162.0 1.0
In [120]:
y_60 = labels_60[['label']]

# Split the data into train/test split (80/20)
X_train_60, X_test_60, y_train_60, y_test_60 = train_test_split(X, y_60, test_size=0.2, random_state=123)

90-days labels

In [121]:
# Create labels for >90 days time to treatment
labels_90 = labels_df.copy()
labels_90.loc[labels_90['treatment_pd'] > 90, 'label'] = 1 
labels_90.loc[labels_90['treatment_pd'] <= 90, 'label'] = 0 
labels_90.head()
Out[121]:
treatment_pd label
0 35.0 0.0
1 33.0 0.0
2 24.0 0.0
3 455.0 1.0
4 162.0 1.0
In [122]:
y_90 = labels_90[['label']]

# Split the data into train/test split (80/20)
X_train_90, X_test_90, y_train_90, y_test_90 = train_test_split(X, y_90, test_size=0.2, random_state=123)

Prep LightGBM dataset¶

30-day classification

In [123]:
# Create the LightGBM dataset
train_data_30 = lgb.Dataset(X_train_30, label=y_train_30, feature_name='auto', categorical_feature=categorical_features)
test_data_30 = lgb.Dataset(X_test_30, label=y_test_30, feature_name='auto', categorical_feature=categorical_features)

# Define the hyperparameters for a regression model
params = {
    'objective': ['binary'],
    'metric': ['rmse'], 
    'is_unbalanced': [True],
    'boosting_type': ['gbdt'], 
    'n_estimators': [50, 100, 200, 300, 400, 500],
    'num_leaves': [31, 61],
    'learning_rate': [0.1],
    'feature_fraction': [0.9],
    'bagging_fraction': [0.8],
    'bagging_freq': [5],
    'verbose': [-1],
    'verbosity': [-1],
}

# Train the model
model_30 = lgb.LGBMClassifier()

# Create the grid search
grid_30 = GridSearchCV(model_30, params, cv=10, scoring='roc_auc')

# Fit the model to the data
grid_30.fit(X_train_30, y_train_30.values.ravel())
Out[123]:
GridSearchCV(cv=10, estimator=LGBMClassifier(),
             param_grid={'bagging_fraction': [0.8], 'bagging_freq': [5],
                         'boosting_type': ['gbdt'], 'feature_fraction': [0.9],
                         'is_unbalanced': [True], 'learning_rate': [0.1],
                         'metric': ['rmse'],
                         'n_estimators': [50, 100, 200, 300, 400, 500],
                         'num_leaves': [31, 61], 'objective': ['binary'],
                         'verbose': [-1], 'verbosity': [-1]},
             scoring='roc_auc')
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
GridSearchCV(cv=10, estimator=LGBMClassifier(),
             param_grid={'bagging_fraction': [0.8], 'bagging_freq': [5],
                         'boosting_type': ['gbdt'], 'feature_fraction': [0.9],
                         'is_unbalanced': [True], 'learning_rate': [0.1],
                         'metric': ['rmse'],
                         'n_estimators': [50, 100, 200, 300, 400, 500],
                         'num_leaves': [31, 61], 'objective': ['binary'],
                         'verbose': [-1], 'verbosity': [-1]},
             scoring='roc_auc')
LGBMClassifier()
LGBMClassifier()
In [124]:
# Print best parameters
print('Best parameters:', grid_30.best_params_)

# Print best score
print('Best score:', grid_30.best_score_)

# Re-train LGB model with best params
best_lgbmodel_30 = lgb.LGBMClassifier(**grid_30.best_params_)
best_lgbmodel_30.fit(X_train_30, y_train_30.values.ravel())
y_test_pred_30 = best_lgbmodel_30.predict(X_test_30)

# Print RMSE on test set
print('RMSE:', root_mean_squared_error(y_true=y_test_30, y_pred=y_test_pred_30))
Best parameters: {'bagging_fraction': 0.8, 'bagging_freq': 5, 'boosting_type': 'gbdt', 'feature_fraction': 0.9, 'is_unbalanced': True, 'learning_rate': 0.1, 'metric': 'rmse', 'n_estimators': 50, 'num_leaves': 31, 'objective': 'binary', 'verbose': -1, 'verbosity': -1}
Best score: 0.6943078278204358
RMSE: 0.45759672268742574
In [125]:
# Alternate (original) form 

# Test the model
y_test_pred_OG_30 = grid_30.predict(X_test_30)

# Print RMSE on test set
print('RMSE:', root_mean_squared_error(y_true=y_test_30, y_pred=y_test_pred_OG_30))
RMSE: 0.45759672268742574

60-day classification

In [126]:
# Create the LightGBM dataset
train_data_60 = lgb.Dataset(X_train_60, label=y_train_60, feature_name='auto', categorical_feature=categorical_features)
test_data_60 = lgb.Dataset(X_test_60, label=y_test_60, feature_name='auto', categorical_feature=categorical_features)

# Define the hyperparameters for a regression model
params = {
    'objective': ['binary'],
    'metric': ['rmse'], 
    'is_unbalanced': [True],
    'boosting_type': ['gbdt'], 
    'n_estimators': [50, 100, 200, 300, 400, 500],
    'num_leaves': [31, 61],
    'learning_rate': [0.1],
    'feature_fraction': [0.9],
    'bagging_fraction': [0.8],
    'bagging_freq': [5],
    'verbose': [-1],
    'verbosity': [-1],
}

# Train the model
model_60 = lgb.LGBMClassifier()

# Create the grid search
grid_60 = GridSearchCV(model_60, params, cv=10, scoring='roc_auc')

# Fit the model to the data
grid_60.fit(X_train_60, y_train_60.values.ravel())
Out[126]:
GridSearchCV(cv=10, estimator=LGBMClassifier(),
             param_grid={'bagging_fraction': [0.8], 'bagging_freq': [5],
                         'boosting_type': ['gbdt'], 'feature_fraction': [0.9],
                         'is_unbalanced': [True], 'learning_rate': [0.1],
                         'metric': ['rmse'],
                         'n_estimators': [50, 100, 200, 300, 400, 500],
                         'num_leaves': [31, 61], 'objective': ['binary'],
                         'verbose': [-1], 'verbosity': [-1]},
             scoring='roc_auc')
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
GridSearchCV(cv=10, estimator=LGBMClassifier(),
             param_grid={'bagging_fraction': [0.8], 'bagging_freq': [5],
                         'boosting_type': ['gbdt'], 'feature_fraction': [0.9],
                         'is_unbalanced': [True], 'learning_rate': [0.1],
                         'metric': ['rmse'],
                         'n_estimators': [50, 100, 200, 300, 400, 500],
                         'num_leaves': [31, 61], 'objective': ['binary'],
                         'verbose': [-1], 'verbosity': [-1]},
             scoring='roc_auc')
LGBMClassifier()
LGBMClassifier()
In [127]:
# Print best parameters
print('Best parameters:', grid_60.best_params_)

# Print best score
print('Best score:', grid_60.best_score_)

# Re-train LGB model with best params
best_lgbmodel_60 = lgb.LGBMClassifier(**grid_60.best_params_)
best_lgbmodel_60.fit(X_train_60, y_train_60.values.ravel())
y_test_pred_60 = best_lgbmodel_60.predict(X_test_60)

# Print RMSE on test set
print('RMSE:', root_mean_squared_error(y_true=y_test_60, y_pred=y_test_pred_60))
Best parameters: {'bagging_fraction': 0.8, 'bagging_freq': 5, 'boosting_type': 'gbdt', 'feature_fraction': 0.9, 'is_unbalanced': True, 'learning_rate': 0.1, 'metric': 'rmse', 'n_estimators': 50, 'num_leaves': 31, 'objective': 'binary', 'verbose': -1, 'verbosity': -1}
Best score: 0.7211502089933228
RMSE: 0.5806267056153314
In [128]:
# Alternate (original) form 

# Test the model
y_test_pred_OG_60 = grid_60.predict(X_test_60)

# Print RMSE on test set
print('RMSE:', root_mean_squared_error(y_true=y_test_60, y_pred=y_test_pred_OG_60))
RMSE: 0.5806267056153314

90-day classification

In [129]:
# Create the LightGBM dataset
train_data_90 = lgb.Dataset(X_train_90, label=y_train_90, feature_name='auto', categorical_feature=categorical_features)
test_data_90 = lgb.Dataset(X_test_90, label=y_test_90, feature_name='auto', categorical_feature=categorical_features)

# Define the hyperparameters for a regression model
params = {
    'objective': ['binary'],
    'metric': ['rmse'], 
    'is_unbalanced': [True],
    'boosting_type': ['gbdt'], 
    'n_estimators': [50, 100, 200, 300, 400, 500],
    'num_leaves': [31, 61],
    'learning_rate': [0.1],
    'feature_fraction': [0.9],
    'bagging_fraction': [0.8],
    'bagging_freq': [5],
    'verbose': [-1],
    'verbosity': [-1],
}

# Train the model
model_90 = lgb.LGBMClassifier()

# Create the grid search
grid_90 = GridSearchCV(model_90, params, cv=10, scoring='roc_auc')

# Fit the model to the data
grid_90.fit(X_train_90, y_train_90.values.ravel())
Out[129]:
GridSearchCV(cv=10, estimator=LGBMClassifier(),
             param_grid={'bagging_fraction': [0.8], 'bagging_freq': [5],
                         'boosting_type': ['gbdt'], 'feature_fraction': [0.9],
                         'is_unbalanced': [True], 'learning_rate': [0.1],
                         'metric': ['rmse'],
                         'n_estimators': [50, 100, 200, 300, 400, 500],
                         'num_leaves': [31, 61], 'objective': ['binary'],
                         'verbose': [-1], 'verbosity': [-1]},
             scoring='roc_auc')
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
GridSearchCV(cv=10, estimator=LGBMClassifier(),
             param_grid={'bagging_fraction': [0.8], 'bagging_freq': [5],
                         'boosting_type': ['gbdt'], 'feature_fraction': [0.9],
                         'is_unbalanced': [True], 'learning_rate': [0.1],
                         'metric': ['rmse'],
                         'n_estimators': [50, 100, 200, 300, 400, 500],
                         'num_leaves': [31, 61], 'objective': ['binary'],
                         'verbose': [-1], 'verbosity': [-1]},
             scoring='roc_auc')
LGBMClassifier()
LGBMClassifier()
In [130]:
# Print best parameters
print('Best parameters:', grid_90.best_params_)

# Print best score
print('Best score:', grid_90.best_score_)

# Re-train LGB model with best params
best_lgbmodel_90 = lgb.LGBMClassifier(**grid_90.best_params_)
best_lgbmodel_90.fit(X_train_90, y_train_90.values.ravel())
y_test_pred_90 = best_lgbmodel_90.predict(X_test_90)

# Print RMSE on test set
print('RMSE:', root_mean_squared_error(y_true=y_test_90, y_pred=y_test_pred_90))
Best parameters: {'bagging_fraction': 0.8, 'bagging_freq': 5, 'boosting_type': 'gbdt', 'feature_fraction': 0.9, 'is_unbalanced': True, 'learning_rate': 0.1, 'metric': 'rmse', 'n_estimators': 50, 'num_leaves': 31, 'objective': 'binary', 'verbose': -1, 'verbosity': -1}
Best score: 0.7608147382642525
RMSE: 0.5091750772173156
In [131]:
# Alternate (original) form 

# Test the model
y_test_pred_OG_90 = grid_90.predict(X_test_90)

# Print RMSE on test set
print('RMSE:', root_mean_squared_error(y_true=y_test_90, y_pred=y_test_pred_OG_90))
RMSE: 0.5091750772173156

Lasso/Lin Reg/Random Forest¶

In [132]:
from sklearn import linear_model
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
In [133]:
# Note: Looks like we need to on-hot-encode categorical features for RandomForestRegressor to work
# 30-day train/test sets
onehot_feat_train_30 = pd.get_dummies(X_train_30, dummy_na = False)
onehot_feat_test_30 = pd.get_dummies(X_test_30, dummy_na = False)

# 60-day train/test sets
onehot_feat_train_60 = pd.get_dummies(X_train_60, dummy_na = False)
onehot_feat_test_60 = pd.get_dummies(X_test_60, dummy_na = False)

# 90-day train/test sets
onehot_feat_train_90 = pd.get_dummies(X_train_90, dummy_na = False)
onehot_feat_test_90 = pd.get_dummies(X_test_90, dummy_na = False)

Random Forest 30-days classification

In [134]:
rand_forest_30 = RandomForestClassifier()

# Fit the model to the data
rand_forest_30.fit(onehot_feat_train_30, y_train_30.values.ravel())

# Test the model
y_test_pred_rf_30 = rand_forest_30.predict(onehot_feat_test_30)

# Print RMSE on test set
print('RMSE:', root_mean_squared_error(y_true=y_test_30, y_pred=y_test_pred_rf_30))

# Pring ROC AUC 
print('ROCAUC:', roc_auc_score(y_true=y_test_30, y_score=y_test_pred_rf_30))
RMSE: 0.47121285397245993
ROCAUC: 0.5261405451170666

Random Forest 60-days classification

In [135]:
rand_forest_60 = RandomForestClassifier()

# Fit the model to the data
rand_forest_60.fit(onehot_feat_train_60, y_train_60.values.ravel())

# Test the model
y_test_pred_rf_60 = rand_forest_60.predict(onehot_feat_test_60)

# Print RMSE on test set
print('RMSE:', root_mean_squared_error(y_true=y_test_60, y_pred=y_test_pred_rf_60))

# Pring ROC AUC 
print('ROCAUC:', roc_auc_score(y_true=y_test_60, y_score=y_test_pred_rf_60))
RMSE: 0.607243241304675
ROCAUC: 0.6187888198757765

Random Forest 90-days classification

In [136]:
rand_forest_90 = RandomForestClassifier()

# Fit the model to the data
rand_forest_90.fit(onehot_feat_train_90, y_train_90.values.ravel())

# Test the model
y_test_pred_rf_90 = rand_forest_90.predict(onehot_feat_test_90)

# Print RMSE on test set
print('RMSE:', root_mean_squared_error(y_true=y_test_90, y_pred=y_test_pred_rf_90))

# Pring ROC AUC 
print('ROCAUC:', roc_auc_score(y_true=y_test_90, y_score=y_test_pred_rf_90))
RMSE: 0.5352944972025405
ROCAUC: 0.6751437921817703

Linear Regression 30-day classification

In [137]:
lin_reg_30 = LinearRegression()

# Fit the model to the data
lin_reg_30.fit(onehot_feat_train_30, y_train_30.values.ravel())

# Test the model
y_test_pred_lin_reg_30 = lin_reg_30.predict(onehot_feat_test_30)

# Print RMSE on test set
print('30-day RMSE:', root_mean_squared_error(y_true=y_test_30, y_pred=y_test_pred_lin_reg_30))

# Pring ROC AUC 
print('30-day ROCAUC:', roc_auc_score(y_true=y_test_30, y_score=y_test_pred_lin_reg_30))
30-day RMSE: 0.3920796690482983
30-day ROCAUC: 0.694683730597767

Linear Regression 60-day classification

In [138]:
lin_reg_60 = LinearRegression()

# Fit the model to the data
lin_reg_60.fit(onehot_feat_train_60, y_train_60.values.ravel())

# Test the model
y_test_pred_lin_reg_60 = lin_reg_60.predict(onehot_feat_test_60)

# Print RMSE on test set
print('60-day RMSE:', root_mean_squared_error(y_true=y_test_60, y_pred=y_test_pred_lin_reg_60))

# Pring ROC AUC 
print('60-day ROCAUC:', roc_auc_score(y_true=y_test_60, y_score=y_test_pred_lin_reg_60))
60-day RMSE: 0.46235875721937514
60-day ROCAUC: 0.7153921802834847

Linear Regression 90-day classification

In [139]:
lin_reg_90 = LinearRegression()

# Fit the model to the data
lin_reg_90.fit(onehot_feat_train_90, y_train_90.values.ravel())

# Test the model
y_test_pred_lin_reg_90 = lin_reg_90.predict(onehot_feat_test_90)

# Print RMSE on test set
print('90-day RMSE:', root_mean_squared_error(y_true=y_test_90, y_pred=y_test_pred_lin_reg_90))

# Pring ROC AUC 
print('90-day ROCAUC:', roc_auc_score(y_true=y_test_90, y_score=y_test_pred_lin_reg_90))
90-day RMSE: 0.4319479074255897
90-day ROCAUC: 0.7649228989570104

LASSO 30-day classification

In [140]:
log_reg_30 = LogisticRegression(penalty='l1', solver='liblinear')

# Fit the model to the data
log_reg_30.fit(onehot_feat_train_30, y_train_30.values.ravel())

# Test the model
y_test_pred_log_reg_30 = log_reg_30.predict(onehot_feat_test_30)

# Print RMSE on test set
print('30-day RMSE:', root_mean_squared_error(y_true=y_test_30, y_pred=y_test_pred_log_reg_30))

# Pring ROC AUC 
print('30-day ROCAUC:', roc_auc_score(y_true=y_test_30, y_score=y_test_pred_log_reg_30))
30-day RMSE: 0.4560147068104949
30-day ROCAUC: 0.5044815852030349

LASSO 60-day classification

In [141]:
log_reg_60 = LogisticRegression(penalty='l1', solver='liblinear')

# Fit the model to the data
log_reg_60.fit(onehot_feat_train_60, y_train_60.values.ravel())

# Test the model
y_test_pred_log_reg_60 = log_reg_60.predict(onehot_feat_test_60)

# Print RMSE on test set
print('60-day RMSE:', root_mean_squared_error(y_true=y_test_60, y_pred=y_test_pred_log_reg_60))

# Pring ROC AUC 
print('60-day ROCAUC:', roc_auc_score(y_true=y_test_60, y_score=y_test_pred_log_reg_60))
60-day RMSE: 0.5858925664041662
60-day ROCAUC: 0.6508819079471253

LASSO 90-day classification

In [142]:
log_reg_90 = LogisticRegression(penalty='l1', solver='liblinear')

# Fit the model to the data
log_reg_90.fit(onehot_feat_train_90, y_train_90.values.ravel())

# Test the model
y_test_pred_log_reg_90 = log_reg_90.predict(onehot_feat_test_90)

# Print RMSE on test set
print('90-day RMSE:', root_mean_squared_error(y_true=y_test_90, y_pred=y_test_pred_log_reg_90))

# Pring ROC AUC 
print('90-day ROCAUC:', roc_auc_score(y_true=y_test_90, y_score=y_test_pred_log_reg_90))
90-day RMSE: 0.5156975787932896
90-day ROCAUC: 0.6889793989967173

SHAP Values¶

In [143]:
import shap
shap.initjs()
No description has been provided for this image
In [144]:
# Ref: https://shap.readthedocs.io/en/latest/example_notebooks/overviews/An%20introduction%20to%20explainable%20AI%20with%20Shapley%20values.html
# https://shap.readthedocs.io/en/latest/example_notebooks/tabular_examples/tree_based_models/Census%20income%20classification%20with%20LightGBM.html
explainer = shap.TreeExplainer(best_lgbmodel_30)
shap_values = explainer(X)
shap.force_plot(explainer.expected_value, shap_values.values[1, :], X.iloc[0, :])
Out[144]:
Visualization omitted, Javascript library not loaded!
Have you run `initjs()` in this notebook? If this notebook was from another user you must also trust this notebook (File -> Trust notebook). If you are viewing this notebook on github the Javascript has been stripped for security. If you are using JupyterLab this error is because a JupyterLab extension has not yet been written.
In [145]:
shap.summary_plot(shap_values, X)
No description has been provided for this image
In [146]:
# Ref: https://shap.readthedocs.io/en/latest/example_notebooks/tabular_examples/tree_based_models/Census%20income%20classification%20with%20LightGBM.html
for name in X_train_30.columns:
    shap.dependence_plot(name, shap_values.values, X)
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
In [ ]: